### ---- SCRIPT 04: The purpose of this script is to provide descriptive statistics for the 825 unique guests ---- ###
### ---- who have appeared on any of the seven selected programmes at any point. This includes summarising the ---- ###
### ---- breakdown of guest category and sub-category types, nationality, and distribution of ideal points by ---- ###
### ---- category type. It also summarises the Twitter profile metadata of guests and wider user groups.      ---- ###

# Heavily penalise scientific notation so that numbers print in fixed notation
options(scipen = 999)

#### ---- LIBRARIES ---- ####

library(dplyr) # data wrangling
library(ggplot2) # data visualisation
library(data.table) # big data
library(treemapify) # generate treemap plots
library(ggridges) # generate ridge plots 

#### ---- LOAD DATA ---- ####

# Guest master list (one CSV, read into a data.table)
guest_data <- fread("username_master_list_with_ideal_points.csv")

# TV guest lists: every file found in guest_lists/ is read with fread(),
# giving a list with one table per file
guest_lists <- lapply(list.files("guest_lists/", full.names = TRUE), fread)

# Twitter user data
user_data <- fread("user_ideal_point_data_updated.csv")

#### ---- SUMMARISE TWITTER PROFILE METADATA ---- ####

# Build one long table so that every account group can be summarised together
# in a single pass. Three sets of rows are stacked:
#   1. the user data as loaded, keeping whatever elite_account values it
#      already carries (assumes that column exists in the file - TODO confirm),
#   2. a copy of the user data relabelled "Overall", and
#   3. the guests that have an ideal point, relabelled "Guest".

user_data_dupe <- user_data
user_data_dupe$elite_account <- "Overall"

guest_data_dupe <- guest_data %>% filter(!is.na(user_ideal_point))
guest_data_dupe$elite_account <- "Guest"

# fill = TRUE pads any column missing from one of the tables with NA
user_data_dupe <- rbind(user_data, user_data_dupe, guest_data_dupe, fill = TRUE)

# One row per elite_account group: group size, share of verified accounts and
# the median of each profile metric (missing values ignored in the medians)
user_data_summary <- user_data_dupe %>%
  group_by(elite_account) %>%
  summarise(count = n(),
            # Rows with a missing `verified` value still count in the denominator
            verified = round(sum(verified, na.rm = TRUE) / count, 3),
            median_mps_followed = median(mps_followed, na.rm = TRUE),
            median_followers = median(followers_count, na.rm = TRUE),
            median_following = median(following_count, na.rm = TRUE),
            median_tweets = median(tweet_count, na.rm = TRUE),
            median_listed = median(listed_count, na.rm = TRUE),
            median_ideal = median(user_ideal_point, na.rm = TRUE))

#### ---- SUMMARISE GUEST CATEGORIES ---- ####

# Firstly, count the guests in each main category type, add each category's
# percentage share of all guests, and order the factor levels by ascending
# count so that the bars are sorted in the chart
guest_catergories_summary <- guest_data %>%
  group_by(category_type) %>%
  summarise(count = n()) %>%
  mutate(perc = count / sum(count) * 100,
         category_type = factor(category_type,
                                levels = category_type[order(count, decreasing = FALSE)]))

# Horizontal bar chart of counts, each bar labelled with its rounded percentage
guest_categories_barchart <- ggplot(guest_catergories_summary,
                                    aes(x = category_type, y = count)) +
  geom_col(fill = "skyblue", width = 0.7, alpha = 0.8) +
  geom_text(aes(label = paste0(round(perc), "%"), y = count),
            hjust = -0.1, size = 3) +
  labs(title = "", x = "", y = "Count") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 12)) +
  coord_flip()

ggsave(filename = "guest_categories_barchart.png",
       plot = guest_categories_barchart,
       width = 7, height = 4, units = "in", dpi = 300,
       bg = "white")

# Secondly, summarise the overall guest data by main category type *and* subcategory of each group.
# summarise() only peels off the last grouping variable, so the result is
# explicitly ungrouped to avoid carrying a category_type grouping forward.
guest_subcatergories_summary <- guest_data %>%
  group_by(category_type, category_name_1) %>%
  summarise(count = n()) %>%
  ungroup()

# Plot as a treemap: tile area is the guest count, tiles are labelled with the
# subcategory and are coloured and boxed by their main category type
subcategory_treemap <- ggplot(guest_subcatergories_summary,
                              aes(area = count, label = category_name_1,
                                  fill = category_type, subgroup = category_type)) +
  geom_treemap() +
  geom_treemap_subgroup_border() +
  geom_treemap_subgroup_text(place = "centre", grow = TRUE, alpha = 0.5,
                             colour = "black", fontface = "italic", min.size = 0) +
  geom_treemap_text(colour = "white", place = "topleft", reflow = TRUE) +
  # "none" is the documented value for hiding the legend
  theme(legend.position = "none")

ggsave("guest_sub_categories_treemap.png",
       subcategory_treemap,
       units = "in", width = 7, height = 4, dpi = 300,
       bg = "white")

# Thirdly, summarise the overall guest data by organisation
# NOTE(review): guests with a missing organisation form their own group here
# and count towards the percentages - confirm this is intended
guest_organisation_summary <- guest_data %>%
  group_by(organisation) %>%
  summarise(count = n())

# Percentage share of all guests (computed before the top-25 cut, so the
# shares refer to the full guest list rather than to the plotted subset)
guest_organisation_summary$perc <- guest_organisation_summary$count / sum(guest_organisation_summary$count) * 100

# Order the factor levels by ascending count so that the bars are sorted in the chart
guest_organisation_summary$organisation <- factor(guest_organisation_summary$organisation,
                                                  levels = guest_organisation_summary$organisation[order(guest_organisation_summary$count, decreasing = FALSE)])

# Select top 25 to plot
guest_organisation_summary <- head(guest_organisation_summary[order(guest_organisation_summary$count, decreasing = TRUE), ], 25)

# Plot as a bar chart
guest_organisation_barchart <- ggplot(guest_organisation_summary, aes(x = organisation, y = count)) +
  geom_bar(stat = "identity", fill = "skyblue", width = 0.7, alpha = 0.8) +
  geom_text(aes(label = paste0(round(perc, 1), "%"), y = count), hjust = -0.1, size = 3) +
  labs(title = "",
       x = "",
       y = "Count") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 12)) +
  coord_flip() +
  # Keep the count axis at 0-175 as before, but widen it if the largest bar
  # exceeds 175: values outside the scale limits are otherwise dropped from
  # the plot
  scale_y_continuous(limits = c(0, max(175, guest_organisation_summary$count)))

ggsave("guest_organisation_barchart.png",
       guest_organisation_barchart,
       units = "in", width = 7, height = 4, dpi = 300,
       bg = "white")

# Finally, count guests per nationality type, add each type's percentage
# share, and fix the factor level order to "Non-British" then "British"
guest_nationality_summary <- guest_data %>%
  group_by(nationality_type) %>%
  summarise(count = n()) %>%
  mutate(perc = count / sum(count) * 100,
         nationality_type = factor(nationality_type,
                                   levels = c("Non-British", "British")))

# Horizontal bar chart of counts, each bar labelled with its rounded percentage
guest_nationality_barchart <- ggplot(guest_nationality_summary,
                                     aes(x = nationality_type, y = count)) +
  geom_col(fill = "skyblue", width = 0.7, alpha = 0.8) +
  geom_text(aes(label = paste0(round(perc), "%"), y = count),
            hjust = -0.1, size = 3) +
  labs(title = "", x = "", y = "Count") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 12)) +
  coord_flip()

ggsave(filename = "guest_nationalities_barchart.png",
       plot = guest_nationality_barchart,
       width = 7, height = 4, units = "in", dpi = 300,
       bg = "white")

#### ---- GUEST CATEGORY IDEAL POINT DISTRIBUTIONS ---- ####

# User ideal points by category type

# Order categories by median ideal point for benefit of plot interpretability:
# the resulting level order runs from the highest median to the lowest
category_ideal_point_plot_ordering <- guest_data %>%
  group_by(category_type) %>%
  summarise(median_ideal_point = median(user_ideal_point, na.rm = TRUE)) %>%
  arrange(desc(median_ideal_point)) %>%
  pull(category_type)

# Note: this converts category_type to a factor in guest_data itself
guest_data$category_type <- factor(guest_data$category_type, levels = category_ideal_point_plot_ordering)

# Raincloud distribution plot: one density ridge per category with a narrow
# boxplot (outliers hidden) drawn over it

ideal_point_distributions_by_cat <- ggplot(guest_data, aes(x = user_ideal_point, y = category_type, fill = category_type)) +
  geom_density_ridges(
    scale = 1,
    alpha = 0.7,
    bandwidth = 0.03,
    rel_min_height = 0.01
  ) +
  geom_boxplot(
    width = .15,
    outlier.shape = NA,
    alpha = 0.7
  ) +
  theme_minimal() +
  # The axis is fixed to 0-1 with the end points labelled "Left" and "Right"
  scale_x_continuous(breaks = seq(0, 1, 0.25),
                     labels = c("Left", "0.25", "0.5", "0.75", "Right"),
                     limits = c(0, 1)) +
  labs(y = "", x = "Ideal Point") +
  theme(axis.title.y = element_blank(),
        legend.position = "none") +
  # Dashed reference line at the midpoint of the axis
  geom_vline(xintercept = 0.5, linetype = "dashed", color = "black")

ggsave("ideal_point_by_category_raincloud_plot.png",
       ideal_point_distributions_by_cat,
       units = "in", width = 7, height = 4, dpi = 300,
       bg = "white")

#### ---- END ---- ####
